{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-04-28T11:45:48.247763",
     "start_time": "2017-04-28T11:45:48.226761"
    }
   },
   "outputs": [],
   "source": [
    "# Basic libraries import\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import collections\n",
    "import itertools\n",
    "import random\n",
    "\n",
    "import sys\n",
    "import os\n",
    "from os.path import join\n",
    "from pathlib import Path\n",
    "\n",
    "# Plotting\n",
    "# Select a single matplotlib backend. Issuing two %matplotlib magics in a row\n",
    "# only leaves the last one active, so just the inline backend is requested.\n",
    "%matplotlib inline\n",
    "\n",
    "# Scale seaborn's \"notebook\" context fonts by 1.5.\n",
    "sns.set_context(\"notebook\", font_scale=1.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_folder = join(str(Path.home()), \"Documents/datasets/\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Intro\n",
    "This notebook explores word embeddings.\n",
    "\n",
    "It includes experiments with Word2Vec using Gensim and an exploration of pretrained GloVe embeddings.\n",
    "\n",
    "**Resources**\n",
    "\n",
    "* https://rare-technologies.com/word2vec-tutorial/\n",
    "* https://radimrehurek.com/gensim/models/word2vec.html\n",
    "* https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html\n",
    "* https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-04-28T11:45:19.826137",
     "start_time": "2017-04-28T11:45:19.816136"
    }
   },
   "outputs": [],
   "source": [
    "sentences = [\"A brown fox jumped on the lazy dog\", \n",
    "            \"A brown fox jumped on the brown duck\",\n",
    "            \"A brown fox jumped on the lazy elephant\",\n",
    "            \"An elephant is eating green grass near the alpaca\",\n",
    "            \"A green alpaca tried to jump over an elephant\",\n",
    "            \"May you rest in a deep and dreamless slumber\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Naive tokenization: split every sentence on whitespace only.\n",
    "tokenized_sentences = [sentence.split() for sentence in sentences]\n",
    "\n",
    "# Vocabulary ordered from the most to the least frequent token.\n",
    "counter = collections.Counter(itertools.chain.from_iterable(tokenized_sentences))\n",
    "vocab = counter.most_common()\n",
    "\n",
    "# Two-way mapping between tokens and their (0-based) rank in the vocabulary.\n",
    "index_to_word = [token for token, _count in vocab]\n",
    "word_to_index = {token: rank for rank, token in enumerate(index_to_word)}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true
   },
   "source": [
    "# Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import gensim, logging"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "## Train Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-04-28T12:10:48.992600",
     "start_time": "2017-04-28T12:10:48.983600"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# parameters\n",
    "size = 200    # size of NN layers, corresponding to word vector dimensionality                      \n",
    "min_count = 1   # minimum word count in order to consider such word                        \n",
    "workers = 4       # number of threads to run in parallel (only effect if you have Cython installed)\n",
    "window = 10          # Context window size                                                                                    \n",
    "downsampling = 1e-3   # Downsample setting for frequent words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-04-28T12:12:08.860169",
     "start_time": "2017-04-28T12:12:08.841167"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "print(\"Training model...\")\n",
    "# Train on the whitespace-tokenized toy sentences with the hyper-parameters\n",
    "# chosen in the parameters cell.\n",
    "model = gensim.models.Word2Vec(\n",
    "    [sentence.split() for sentence in sentences],\n",
    "    size=size,\n",
    "    window=window,\n",
    "    min_count=min_count,\n",
    "    sample=downsampling,\n",
    "    workers=workers,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Freeze the model once training is finished: per the gensim documentation,\n",
    "# init_sims(replace=True) keeps only the normalised vectors, which makes the\n",
    "# model much more memory-efficient but rules out any further training.\n",
    "model.init_sims(replace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Save the trained model under a file name that records its hyper-parameters.\n",
    "# `corpus_name` is the label of the training corpus used in that file name; it is\n",
    "# defined here so the cell also runs on a fresh kernel (no other cell sets it).\n",
    "corpus_name = \"toy_sentences\"\n",
    "model_name = \"w2v_{}_size{}_mincount{}_window{}\".format(\n",
    "    corpus_name, size, min_count, window\n",
    ")\n",
    "model.save(model_name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "heading_collapsed": true,
    "hidden": true
   },
   "source": [
    "## Test Word2Vec Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Read the model back from the file name it was saved under.\n",
    "model = gensim.models.Word2Vec.load(model_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Sanity checks on the model: analogy, odd-one-out and pairwise similarity.\n",
    "# Only the value of the last expression is displayed by the notebook.\n",
    "# NOTE(review): none of these query words occur in the toy `sentences` corpus,\n",
    "# so they presumably target a model trained on a larger corpus -- confirm.\n",
    "model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)\n",
    "model.doesnt_match(\"breakfast cereal dinner lunch\".split())\n",
    "model.similarity('woman', 'man')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-04-28T12:18:30.365989",
     "start_time": "2017-04-28T12:18:30.358989"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Look up several words at once and inspect the shape of the stacked vectors.\n",
    "model[[\"brown\", \"fox\"]].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-04-28T14:48:40.821946",
     "start_time": "2017-04-28T14:48:40.817945"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "import collections\n",
    "import itertools\n",
    "\n",
    "import nltk\n",
    "\n",
    "# Token frequencies over the whitespace-split toy sentences, via NLTK.\n",
    "nltk.FreqDist(itertools.chain.from_iterable(s.split() for s in sentences))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2017-04-28T14:48:53.828690",
     "start_time": "2017-04-28T14:48:53.825689"
    },
    "hidden": true
   },
   "outputs": [],
   "source": [
    "# Same token frequencies, computed with the standard library only.\n",
    "collections.Counter(itertools.chain.from_iterable(s.split() for s in sentences))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# GloVe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pre-trained GloVe vectors into a {word: vector} dictionary.\n",
    "EMBEDDING_DIM = 100\n",
    "# The file name is derived from EMBEDDING_DIM so the two cannot drift apart.\n",
    "glove_path = join(data_folder, \"glove\", \"glove.6B.{}d.txt\".format(EMBEDDING_DIM))\n",
    "\n",
    "embeddings = {}\n",
    "# Read the file as UTF-8 explicitly instead of relying on the platform default\n",
    "# encoding, which may mis-decode or reject non-ASCII tokens.\n",
    "with open(glove_path, encoding=\"utf-8\") as glove:\n",
    "    for line in glove:\n",
    "        # Each line holds a token followed by its vector components.\n",
    "        values = line.strip().split()\n",
    "        if not values:\n",
    "            # Skip blank lines rather than failing on values[0].\n",
    "            continue\n",
    "        word = values[0]\n",
    "        vector = np.asarray(values[1:], dtype='float32')\n",
    "        embeddings[word] = vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Spot-check the vector stored for a single word.\n",
    "embeddings[\"objected\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the position of 'objected' in the dictionary's iteration order.\n",
    "for position, token in enumerate(embeddings):\n",
    "    if token == 'objected':\n",
    "        print(position)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create embedding matrix\n",
    "embeddings_matrix = np.zeros((len(word_to_index)+1, EMBEDDING_DIM))\n",
    "for word, i in word_to_index.items():\n",
    "    if word in embeddings:\n",
    "        embeddings_matrix[i] = embeddings[word]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda env:tensor-flow]",
   "language": "python",
   "name": "conda-env-tensor-flow-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.0"
  },
  "toc": {
   "nav_menu": {
    "height": "86px",
    "width": "252px"
   },
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
